package com.business;

//import java.io.File;
import java.util.ArrayList;

import org.apache.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Gets information from the website: downloads the first few list pages under
 * {@code http://yz.tongji.edu.cn/html/zsxw/sszs/} and turns every list row
 * into an {@link InfoBean} (text, date and link).
 *
 * @author bob yang
 * @date 2018-03-26
 */
public class InfoCrawler
{
	private static final Logger logger = Logger.getLogger(InfoCrawler.class);

	/** Scheme and host of the crawled site; also prefixed to every extracted href. */
	private static final String SITE_ROOT = "http://yz.tongji.edu.cn";

	/** Path of the paginated list; the page number and ".html" are appended to it. */
	private static final String LIST_PATH = "/html/zsxw/sszs/";

	/** Only the first three list pages are crawled. */
	private static final int PAGE_COUNT = 3;

	/**
	 * Crawls list pages 1..{@value #PAGE_COUNT} in order and collects one
	 * {@link InfoBean} per list row.
	 *
	 * @return the beans of all crawled pages, in page order and then row order
	 * @throws RuntimeException if any page cannot be fetched or parsed; the
	 *                          original failure is attached as the cause and no
	 *                          partial result is returned
	 * @throws Exception        never thrown directly; kept in the signature so
	 *                          existing callers keep compiling unchanged
	 */
	public static ArrayList<InfoBean> getInfo() throws Exception
	{
		ArrayList<InfoBean> infoList = new ArrayList<InfoBean>();
		try
		{
			for (int page = 1; page <= PAGE_COUNT; page++)
			{
				String url = SITE_ROOT + LIST_PATH + page + ".html";
				Document doc = Jsoup.connect(url).get();
				infoList.addAll(parsePage(doc));
			}
		}
		catch (Exception e)
		{
			// Pass the exception itself to the logger and to the wrapper so the
			// stack trace of the real failure is not lost.
			logger.error("InfoCrawler.java" + "-------->" + e.toString(), e);
			throw new RuntimeException("爬取网页信息失败，请检查后台", e);
		}
		return infoList;
	}

	/**
	 * Extracts the list rows of one downloaded page.
	 *
	 * The page is expected to contain a single element with class
	 * {@code list_main_content}; its first child is the {@code <ul>} and each
	 * child of that {@code <ul>} (an {@code <li>}) is one row. If that structure
	 * is missing, the resulting IndexOutOfBoundsException propagates to the caller.
	 *
	 * @param doc the parsed list page
	 * @return one bean per row, in document order
	 */
	private static ArrayList<InfoBean> parsePage(Document doc)
	{
		Elements rows = doc.getElementsByClass("list_main_content").get(0).children().get(0).children();
		ArrayList<InfoBean> beans = new ArrayList<InfoBean>(rows.size());
		for (Element row : rows)
		{
			beans.add(toInfoBean(row));
		}
		return beans;
	}

	/**
	 * Builds the bean for a single list row.
	 *
	 * @param row one row element of the list
	 * @return a bean holding the anchor text, the {@code list_time} text and the
	 *         anchor href prefixed with the site root
	 */
	private static InfoBean toInfoBean(Element row)
	{
		// Look the anchors up once; both the text and the href come from them.
		Elements anchors = row.getElementsByTag("a");
		InfoBean bean = new InfoBean();
		bean.setInfo(anchors.text());
		bean.setDate(row.getElementsByClass("list_time").text());
		bean.setHref(SITE_ROOT + anchors.attr("href"));
		return bean;
	}
}
